{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "10fe2a5f-6e34-413f-b7cf-82c1c67b8686",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models.ldamodel import LdaModel\n",
    "import gensim.corpora as corpora\n",
    "from pprint import pprint\n",
    "from gensim.corpora import MmCorpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "c88f6f36-95b5-49ba-9835-279c9c005543",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           category                                               text\n",
      "0              tech  tv future in the hands of viewers with home th...\n",
      "1          business  worldcom boss  left books alone  former worldc...\n",
      "2             sport  tigers wary of farrell  gamble  leicester say ...\n",
      "3             sport  yeading face newcastle in fa cup premiership s...\n",
      "4     entertainment  ocean s twelve raids box office ocean s twelve...\n",
      "...             ...                                                ...\n",
      "2220       business  cars pull down us retail figures us retail sal...\n",
      "2221       politics  kilroy unveils immigration policy ex-chatshow ...\n",
      "2222  entertainment  rem announce new glasgow concert us band rem h...\n",
      "2223       politics  how political squabbles snowball it s become c...\n",
      "2224          sport  souness delight at euro progress boss graeme s...\n",
      "\n",
      "[2225 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "stop_words = stopwords.words('english')\n",
    "stop_words.append(\"said\")\n",
    "bbc_df = pd.read_csv(\"../data/bbc-text.csv\")\n",
    "print(bbc_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "32d57a74-0547-4774-ad98-f32d1a5b7e06",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(input_string):\n",
    "    input_string = re.sub(r'[^\\w\\s]', ' ', input_string)\n",
    "    input_string = re.sub(r'\\d', '', input_string)\n",
    "    input_list = simple_preprocess(input_string)\n",
    "    input_list = [word for word in input_list if word not in stop_words]\n",
    "    return input_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "5c32542c-a413-4654-bf20-67a93468d4f0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           category                                               text\n",
      "0              tech  [tv, future, hands, viewers, home, theatre, sy...\n",
      "1          business  [worldcom, boss, left, books, alone, former, w...\n",
      "2             sport  [tigers, wary, farrell, gamble, leicester, say...\n",
      "3             sport  [yeading, face, newcastle, fa, cup, premiershi...\n",
      "4     entertainment  [ocean, twelve, raids, box, office, ocean, twe...\n",
      "...             ...                                                ...\n",
      "2220       business  [cars, pull, us, retail, figures, us, retail, ...\n",
      "2221       politics  [kilroy, unveils, immigration, policy, ex, cha...\n",
      "2222  entertainment  [rem, announce, new, glasgow, concert, us, ban...\n",
      "2223       politics  [political, squabbles, snowball, become, commo...\n",
      "2224          sport  [souness, delight, euro, progress, boss, graem...\n",
      "\n",
      "[2225 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "bbc_df['text'] = bbc_df['text'].apply(lambda x: clean_text(x))\n",
    "print(bbc_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "df12d635-b789-4948-9fdd-d61b4a7df599",
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = bbc_df['text'].values\n",
    "id_dict = corpora.Dictionary(texts)\n",
    "corpus = [id_dict.doc2bow(text) for text in texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "df85a4d2-f87f-413e-ac16-1b82946772cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_topics = 5\n",
    "lda_model = LdaModel(corpus=corpus,\n",
    "                     id2word=id_dict,\n",
    "                     num_topics=num_topics, \n",
    "                     chunksize=100,\n",
    "                     passes=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "46499af5-6650-4026-b190-a86f6c5ca136",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0,\n",
      "  '0.010*\"people\" + 0.006*\"mobile\" + 0.005*\"one\" + 0.005*\"new\" + '\n",
      "  '0.005*\"technology\" + 0.004*\"also\" + 0.004*\"use\" + 0.004*\"net\" + '\n",
      "  '0.004*\"many\" + 0.004*\"digital\"'),\n",
      " (1,\n",
      "  '0.007*\"game\" + 0.006*\"first\" + 0.005*\"time\" + 0.005*\"year\" + '\n",
      "  '0.005*\"england\" + 0.005*\"world\" + 0.005*\"win\" + 0.005*\"last\" + '\n",
      "  '0.005*\"players\" + 0.005*\"one\"'),\n",
      " (2,\n",
      "  '0.013*\"bn\" + 0.013*\"us\" + 0.011*\"year\" + 0.009*\"sales\" + 0.006*\"market\" + '\n",
      "  '0.006*\"company\" + 0.005*\"growth\" + 0.005*\"bank\" + 0.005*\"firm\" + '\n",
      "  '0.005*\"also\"'),\n",
      " (3,\n",
      "  '0.009*\"film\" + 0.009*\"blair\" + 0.008*\"party\" + 0.008*\"best\" + 0.006*\"also\" '\n",
      "  '+ 0.006*\"one\" + 0.005*\"year\" + 0.005*\"show\" + 0.004*\"new\" + 0.004*\"us\"'),\n",
      " (4,\n",
      "  '0.020*\"mr\" + 0.013*\"would\" + 0.008*\"government\" + 0.007*\"people\" + '\n",
      "  '0.006*\"labour\" + 0.005*\"could\" + 0.005*\"election\" + 0.005*\"minister\" + '\n",
      "  '0.004*\"told\" + 0.004*\"also\"')]\n"
     ]
    }
   ],
   "source": [
    "pprint(lda_model.print_topics())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "480bac4b-9019-4c5e-bb24-970744ea9895",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_model(lda, lda_path, id_dict, dict_path, corpus, corpus_path):\n",
    "    lda.save(lda_path)\n",
    "    id_dict.save(dict_path)\n",
    "    MmCorpus.serialize(corpus_path, corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "8a30e9ec-ec40-4587-95ce-5ac0339f8874",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_path = \"../models/bbc_gensim/lda.model\"\n",
    "dict_path = \"../models/bbc_gensim/id2word.dict\"\n",
    "corpus_path = \"../models/bbc_gensim/corpus.mm\"\n",
    "save_model(lda_model, model_path, id_dict, dict_path, corpus, corpus_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "5585810e-a303-4d1a-85e7-798ed703ff75",
   "metadata": {},
   "outputs": [],
   "source": [
    "lda_model = LdaModel.load(model_path)\n",
    "id_dict = corpora.Dictionary.load(dict_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "78b7976b-23f8-4084-8570-748cfc81035b",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_example = \"\"\"Manchester United players slumped to the turf \n",
    "at full-time in Germany on Tuesday in acknowledgement of what their \n",
    "latest pedestrian first-half display had cost them. The 3-2 loss at \n",
    "RB Leipzig means United will not be one of the 16 teams in the draw \n",
    "for the knockout stages of the Champions League. And this is not the \n",
    "only price for failure. The damage will be felt in the accounts, in \n",
    "the dealings they have with current and potentially future players \n",
    "and in the faith the fans have placed in manager Ole Gunnar Solskjaer. \n",
    "With Paul Pogba's agent angling for a move for his client and ex-United \n",
    "defender Phil Neville speaking of a \"witchhunt\" against his former team-mate \n",
    "Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "6d81a0ed-d606-469c-b93f-c916971c1bce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 0.7338447), (2, 0.15261793), (4, 0.1073401)]\n"
     ]
    }
   ],
   "source": [
    "input_list = clean_text(new_example)\n",
    "bow = id_dict.doc2bow(input_list)\n",
    "topics = lda_model[bow]\n",
    "print(topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac8982fe-3067-4c3a-9a06-ae6ac289a9ee",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
